/**
 * Copyright (C) 2007 Stefan Buettcher. All rights reserved.
 * This is free software with ABSOLUTELY NO WARRANTY.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA
 **/

/**
 * author: Stefan Buettcher
 * created: 2006-09-06
 * changed: 2007-07-05
 **/


#include <math.h>
#include <string.h>
#include "languagemodel_query.h"
#include "getquery.h"
#include "querytokenizer.h"
#include "../filters/inputstream.h"
#include "../filters/xml_inputstream.h"
#include "../indexcache/extentlist_cached.h"
#include "../misc/all.h"


static const char *LOG_ID = "LanguageModelQuery";

static const char *DEFAULT_SMOOTHING = "dirichlet";


void LanguageModelQuery::initialize(Index *index, char *command, char **modifiers,
		char *body, VisibleExtents *visibleExtents, int memoryLimit) {
	this->index = index;
	this->visibleExtents = visibleExtents;
	this->memoryLimit = memoryLimit;
	processModifiers(modifiers);
	queryString = duplicateString(body);
	actualQuery = this;
	ok = false;
} // end of initialize(...)


LanguageModelQuery::LanguageModelQuery(Index *index, char *command, char **modifiers, char *body,
		VisibleExtents *visibleExtents, int memoryLimit) {
	initialize(index, command, modifiers, body, visibleExtents, memoryLimit);
	mustFreeVisibleExtentsInDestructor = false;
} // end of LanguageModelQuery(...)


LanguageModelQuery::LanguageModelQuery(Index *index, char *command, char **modifiers, char *body,
		uid_t userID, int memoryLimit) {
	this->userID = userID;
	VisibleExtents *visibleExtents = index->getVisibleExtents(userID, false);
	initialize(index, command, modifiers, body, visibleExtents, memoryLimit);
	mustFreeVisibleExtentsInDestructor = true;
} // end of LanguageModelQuery(...)


LanguageModelQuery::~LanguageModelQuery() {
} // end of ~LanguageModelQuery()


void LanguageModelQuery::processCoreQuery() {
	offset start, end;
	double freq[MAX_SCORER_COUNT], df[MAX_SCORER_COUNT];
	double avgTF[MAX_SCORER_COUNT], avgDensity[MAX_SCORER_COUNT];
	ExtentList *elementLists[MAX_SCORER_COUNT];
	ExtentList *containerList = containerQuery->getResult();
	ExtentList *statsList = statisticsQuery->getResult();

	// compute collection statistics and global probability distributions
	double corpusSize = statsList->getTotalSize();
	double p_global[MAX_SCORER_COUNT];
	double totalTermWeight = 0;
	for (int i = 0; i < elementCount; i++) {
		// count the total number of occurrences of the i-th query term in
		// the portion of the address space covered by statisticsQuery
		elementLists[i] = elementQueries[i]->getResult();
		ExtentList_Containment c(statsList, elementLists[i], false, false);
		freq[i] = c.getLength();
		if (freq[i] == 0)
			freq[i] = corpusSize - 1;
		p_global[i] = (freq[i] + 0.5) / corpusSize;
		totalTermWeight += -log(p_global[i]) * externalWeights[i];
		c.detachSubLists();
	}

	// initialize heap structure
	ScoredExtent candidate;
	results = typed_malloc(ScoredExtent, count + 1);
	int resultCount = 0;

	// prune the search by only looking at documents that might contain
	// interesting information; "nextOffsetPossible" contains the next
	// index position at which we can find a query element
	offset nextForTerm[MAX_SCORER_COUNT];
	offset nextOffsetPossible = MAX_OFFSET;
	offset elemStart, elemEnd;
	for (int elem = 0; elem < elementCount; elem++) {
		nextForTerm[elem] = MAX_OFFSET;
		if (elementLists[elem]->getFirstEndBiggerEq(0, &elemStart, &elemEnd)) {
			nextForTerm[elem] = elemEnd;
			nextOffsetPossible = MIN(nextOffsetPossible, elemEnd);
		}
	}

	while (containerList->getFirstEndBiggerEq(nextOffsetPossible, &start, &end)) {
		double docLen = (end - start + 1);
		candidate.from = start;
		candidate.to = end;
		candidate.score = totalTermWeight;

		// compute document score according to the query being generated by the
		// current document's language model
		for (int i = 0; i < elementCount; i++) {
			double tf = (nextForTerm[i] <= end ? elementLists[i]->getCount(start, end) : 0);
			double p_smoothed;
			if (smoothingMethod == SMOOTHING_DIRICHLET)
				p_smoothed = (tf + dirichletMu * p_global[i]) / (docLen + dirichletMu);
			else if (smoothingMethod == SMOOTHING_JELINEK)
				p_smoothed = (1 - jelinekLambda) * (tf / docLen) + jelinekLambda * p_global[i];
			else {
				log(LOG_ERROR, LOG_ID, "Internal error: Undefined smoothing method!");
				exit(1);
			}
			candidate.score += externalWeights[i] * log(MAX(1E-12, p_smoothed));
		}

		addToResultSet(&candidate, &resultCount);

		// find next document that could possibly contain one of the query terms
		if (!containerList->getFirstEndBiggerEq(end + 1, &start, &end))
			break;
		nextOffsetPossible = MAX_OFFSET;
		for (int elem = 0; elem < elementCount; elem++) {
			if (nextForTerm[elem] < start) {
				nextForTerm[elem] = MAX_OFFSET;
				if (elementLists[elem]->getFirstEndBiggerEq(start, &elemStart, &elemEnd))
					nextForTerm[elem] = elemEnd;
			}
			nextOffsetPossible = MIN(nextOffsetPossible, nextForTerm[elem]);
		}
	} // end while (containerList->getFirstEndBiggerEq(nextOffsetPossible, &start, &end))

	count = resultCount;
} // end of processCoreQuery()


void LanguageModelQuery::processModifiers(char **modifiers) {
	RankedQuery::processModifiers(modifiers);
	char *s = getModifierString(modifiers, "smoothing", DEFAULT_SMOOTHING);
	if (strcasecmp(s, "dirichlet") == 0) {
		smoothingMethod = SMOOTHING_DIRICHLET;
		dirichletMu = getModifierDouble(modifiers, "mu", DEFAULT_MU);
	}
	else if ((strcasecmp(s, "jelinek-mercer") == 0) || (strcasecmp(s, "jm") == 0)) {
		smoothingMethod = SMOOTHING_JELINEK;
		jelinekLambda = getModifierDouble(modifiers, "lambda", DEFAULT_LAMBDA);
	}
	else {
		smoothingMethod = SMOOTHING_JELINEK;
		jelinekLambda = 0;
	}
	free(s);
} // end of processModifiers(char**)


